IMPORT LIBRAIRIES

IMPORT DATA

# Import metadata
metadata_table <- read.table("/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/PozueloSraRunTable.txt", header = TRUE,
                             sep = ",")
# Keep only relevant metadata
metadata_table <- metadata_table[, c("Run", "Sample.Name")]
rownames(metadata_table) <- metadata_table$Run # assign sample names as row names
metadata_table$Sample.Name <- as.character(metadata_table$Sample.Name) # Sample.Name is of class "factor", transform it to characters
metadata_table$Collection <- "1st" # add column to specify if sample is from first or 2nd collection
metadata_table[gsub('.*(?=.{2}$)', '', metadata_table$Sample.Name, perl=T) == '.2', 'Collection'] <- "2nd"
metadata_table$host_disease <- "Healthy" # add column for host_disease
metadata_table[substr(metadata_table$Sample.Name, 1, 2) == "MO", 'host_disease'] <- "IBS"
metadata_table[, 'Sample.Name'] <- gsub("MO", "IBS", metadata_table[, 'Sample.Name']) # replace 'MO' by 'IBS' in the sample.names
metadata_table$Sample.Name <- gsub("\\.2$", "", metadata_table[, 'Sample.Name']) # remove the '.2' at the end of some of the sample.names (indicating it is the 2nd collection)

saveRDS(metadata_table, "/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/metadata_final.rds")


#________________________________________________________________________________________________________
# Import OTU table (rows: sample names // columns: sequence variants)
seqtable.nochim <- readRDS("/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/ASVtable_final.rds")
dim(seqtable.nochim) # should have 290 samples and 6222 ASV
rownames(seqtable.nochim) <- gsub("\\.fastq","", rownames(seqtable.nochim)) # remove the .fastq extension in sample names

# Import Taxonomic table (rows: sequence variants // columns: Kingdom, Phylum, Class, ...)
taxa <- readRDS("/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/taxa_final.rds")
dim(taxa)
##     UniFrac2          JSD     vegdist1     vegdist2     vegdist3     vegdist4 
##   "wunifrac"        "jsd"  "manhattan"  "euclidean"   "canberra"       "bray" 
##     vegdist5     vegdist6     vegdist7     vegdist8     vegdist9    vegdist10 
## "kulczynski"    "jaccard"      "gower"   "altGower"   "morisita"       "horn" 
##    vegdist12    vegdist13    vegdist15 
##       "raup"   "binomial"        "cao"

CREATE PHYLOSEQ OBJECT

#____________________________________________________________________
# OTU TABLE, TAXA TABLE AND METADATA
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                  sample_data(metadata_table), 
                  tax_table(taxa))

#____________________________________________________________________
# PHYLOGENETIC TREE

# Multiple-sequence alignment (know which regions are conserved/different to be able to do the phylogeny)
seqs <- getSequences(seqtable.nochim) # get the sequence variants (the ASVs)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA) # by default, anchor = 0.7 which means that 70% of sequences must share a common region to anchor the alignment space.

# Construct a neighbor-joining tree
phang.align <- phyDat(as(alignment, "matrix"), type="DNA") # transform the aligned DNA sequences into a phyDat (phangorn) object
dm <- dist.ml(phang.align) # compute pairwise distance between DNA sequences (with the Jukes-Cantor estimate of the evolutionary distance)
treeNJ <- NJ(dm) # create a neighbor-joining tree estimation based on the distance matrix
fit <- pml(treeNJ, data=phang.align) # get the likelihood of the phylogenetic tree given the sequence alignment (then we'll optimize it)

## negative edges length changed to 0!

# Fit a generalized time-reversible with gamma rate variation (GTR+G+I)
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE, # gamma rate and proportion of variable size get optimized
                      rearrangement = "stochastic", control = pml.control(trace = 0)) # (trace = 0) don't show output during optimization
# stochastic tree rearrangement

# Add tree to physeq
physeq <- merge_phyloseq(physeq, phy_tree(fitGTR$tree))
# Look at the tree
plot_tree(physeq, color = "host_disease", ladderize="left", sizebase = 2)

#____________________________________________________________________
# GIVE SURNAMES TO OTUs

# Give surnames to sequence variants & store the sequence variants in "refseq" in the phyloseq object
dna <- DNAStringSet(taxa_names(physeq)) # get the sequence variants (ASVs)
names(dna) <- taxa_names(physeq) # no idea what this does
physeq <- merge_phyloseq(physeq, dna) # store the dna sequences in the refseq of the phyloseq object
taxa_names(physeq) <- paste0("OTU", seq(ntaxa(physeq))) # replace the whole dna sequences in the taxa_names by a surname OTU1, OTU2, etc.

# Save physeq object
saveRDS(physeq, "/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/Rproject_DataAnalysis/physeq.rds")

ABSOLUTE AND RELATIVE ABUNDANCES

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

# Relative abundance for Class
class.table <- physeq %>%
  tax_glom(taxrank = "Class") %>%                     # agglomerate at class level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format


# Plot Phylum
plot_bar(physeq, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_text(size = 8))+
  labs(x = "Samples", y = "Absolute abundance", title = "Pozuelo dataset (2015)")+
  ylim(0, 45000)

ggplot(phylum.table[phylum.table[,'Kingdom'] == 'Bacteria',], aes(x = Sample.Name, y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease + Collection, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 6, angle = -90))+
  labs(x = "Samples", y = "Relative abundance", title = "1st and 2nd time points")

# Plot Class
plot_bar(physeq, fill = "Class")+ facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_text(size = 8))+
  labs(x = "Samples", y = "Absolute abundance", title = "Pozuelo dataset (2015)")+
  ylim(0, 45000)

ggplot(class.table[class.table[,'Kingdom'] == 'Bacteria',], aes(x = Sample, y = Abundance, fill = Class))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 8, angle = -90))+
  labs(x = "Samples", y = "Relative abundance", title = "Pozuelo dataset (2015)")

# Extract abundance only Bacteroidota and Firmicutes
bacter <- phylum.table[phylum.table[,'Phylum'] == 'Bacteroidota', c('Sample', 'Abundance', 'host_disease',
                                                                    'Collection', 'Phylum')]
bacter <- bacter[order(bacter$Sample),] # order by sample name
firmi <- phylum.table[phylum.table[,'Phylum'] == 'Firmicutes', c('Sample', 'Abundance', 'host_disease',
                                                                 'Collection','Phylum')]
firmi <- firmi[order(firmi$Sample),] # order by sample name

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio_FB <- data.table('Sample' = bacter$Sample,
                       'host_disease' = bacter$host_disease,
                       'Collection' = bacter$Collection,
                       'Bacteroidota' = bacter$Abundance,
                       'Firmicutes' = firmi$Abundance)
ratio_FB[,"Log_ratio_FB"] <- log2(ratio_FB[,"Firmicutes"] / ratio_FB[,"Bacteroidota"])

# Plot log2 ratio Firmicutes/Bacteroidota
par(mar = c(3, 10, 3, 10))
boxplot(data = ratio_FB, Log_ratio_FB ~ host_disease, ylab = "Log2 Ratio Abundance", xlab = "", main = "Firmicutes to Bacteroidota ratio")

NORMALIZE DATA

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH PSEUDOCOUNTS
physeq.pseudocts <- physeq
otu_table(physeq.pseudocts)[otu_table(physeq.pseudocts) == 0] <- 0.5

# check the 0 values have been replaced
otu_table(physeq)[1:5,1:5]
otu_table(physeq.pseudocts)[1:5,1:5]

# save the physeq.pseudocts object
saveRDS(physeq.pseudocts, "/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/Rproject_DataAnalysis/physeq_pseudocts.rds")


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.pseudocts
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # normalize the samples at 1 total count per sample

# check the counts are all relative
otu_table(physeq.pseudocts)[1:5, 1:5]
otu_table(physeq.rel)[1:5, 1:5]

# sanity check
sum(otu_table(physeq.rel) < 1) # see how many negative values are present in the matrix
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, "/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/Rproject_DataAnalysis/physeq_relative.rds")


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.pseudocts
physeq.clr <- transform(physeq.clr, "clr")

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.pseudocts)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, "/Users/enigma/Desktop/Munich/Praktikum/Data/Pozuelo_2015/Rproject_DataAnalysis/physeq_clr.rds")

COMPUTE DISTANCES

#____________________________________________________________________________________
# Measure distances
Distances <- function(physeq_obj){
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <-  UniFrac(physeq_obj, weighted=TRUE, normalized=TRUE) # weighted unifrac
  glom.ait <- dist(x = otu_table(physeq.clr), method = 'euc') # aitchison
  glom.bray <- phyloseq::distance(physeq_obj, method = "bray") # bray-curtis
  glom.can <- phyloseq::distance(physeq_obj, method = "canberra") # canberra
  glom.gower <- phyloseq::distance(physeq_obj, method = "gower") # gower
  glom.bino <- phyloseq::distance(physeq_obj, method = "binomial") # binomial
  
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray,
                    "Gower" = glom.gower, "Binomial" = glom.bino)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot 2D ordination
MDS_2D <- function(physeq_obj, ait_dist){
  
  plist <- NULL
  plist <- vector("list", length(dist_methods)+1) # save each plot to a list
  names(plist) <- dist_methods # save the name of each method
  names(plist)[16] <- "aitchison" # save the name of aitchison
  
  # Loop through all distance methods
  for(i in dist_methods){
    # Calculate distance matrix
    #print(i)
    set.seed(123) # in case the distance method needs a rooted tree (weighted unifrac)
    iDist <- phyloseq::distance(physeq_obj, method=i)
    # Calculate ordination
    set.seed(123)
    iMDS  <- ordinate(physeq_obj, "MDS", distance=iDist)
    ## Make plot
    # Don't carry over previous plot (if error, p will be blank)
    p <- NULL
    # Create plot, store as temporary variable, p
    p <- plot_ordination(physeq_obj, iMDS, color="host_disease")
    # Add title to each plot
    p <- p + ggtitle(paste("MDS using distance method ", i, sep=""))
    # Save the graphic to the plot list
    plist[[i]] = p
  }
  
  # Add aitchison
  iMDS  <- ordinate(physeq_obj, "MDS", distance=ait_dist)
  p <- NULL
  p <- plot_ordination(physeq_obj, iMDS, color="host_disease")
  p <- p + ggtitle("MDS using distance method Aitchison")
  plist[[17]] = p
  
  # Creating a dataframe to plot everything
  plot.df = ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  # Plot
  p.alldist <-  ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease, shape = Collection))+
    geom_point(size=5, alpha=0.5) +
    scale_color_manual(values = c('blue', 'red'))+
    scale_shape_manual(values = c(16,17))+
    facet_wrap(~distance, scales='free')+
    theme(strip.text = element_text(size = 40),
          legend.text = element_text(size = 20),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20))

  return(p.alldist)
}

#____________________________________________________________________________________
# Plot 3D ordination
MDS_3D <- function(d, name_dist){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123) # to get the same dimensionality reduction at each run
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the x y z coordinates
  
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq.rel)$host_disease, colors = c("blue", "red"))%>%
    #add_trace(x=xyz[,1], y=xyz[,2], z=xyz[,3],
    #       type='scatter3d', mode='text', text = rownames(xyz), textfont = list(color = "black")) %>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}
# Get the distances
no_glom.dist <- Distances(physeq_obj = physeq.rel)

# Get the 2D ordination plots
no_glom.2D.alldist <- MDS_2D(physeq.rel, no_glom.dist$Ait)
no_glom.2D.alldist + labs(title = 'All samples') + theme(title = element_text(size = 30))

# Get 3D MDS plots
no_glom.3D.UniF <- MDS_3D(no_glom.dist$UniF, 'weighted Unifrac')
no_glom.3D.Ait <- MDS_3D(no_glom.dist$Ait, 'Aitchison')
no_glom.3D.Can <- MDS_3D(no_glom.dist$Canb, 'Canberra')
no_glom.3D.Gower <- MDS_3D(no_glom.dist$Gower, 'Gower')
no_glom.3D.Binomial <- MDS_3D(no_glom.dist$Binomial, 'Binomial')

no_glom.3D.UniF
no_glom.3D.Ait
no_glom.3D.Can
no_glom.3D.Gower
no_glom.3D.Binomial

HIERARCHICAL CLUSTERING

Heatmaps <- function(dist_list, fontsize){
  # Weighted Unifrac
  heatmp.UniF <- pheatmap(as.matrix(dist_list$UniF), 
                          clustering_distance_rows = dist_list$UniF,
                          clustering_distance_cols = dist_list$UniF,
                          fontsize = fontsize,
                          fontsize_col = fontsize-5,
                          fontsize_row = fontsize-5,
                          annotation_col = mat_col,
                          annotation_row = mat_col,
                          annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   collection = c('1st' = '#33FF00', '2nd' = '#9933CC')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', #hierarchical method
                          main = 'Weighted unifrac distance')

  # Aitchison
  heatmp.Ait <- pheatmap(as.matrix(dist_list$Ait), 
                         clustering_distance_rows = dist_list$Ait,
                         clustering_distance_cols = dist_list$Ait,
                         fontsize = fontsize,
                         fontsize_col = fontsize-5,
                         fontsize_row = fontsize-5,
                         # border_color = NA,
                         cluster_rows = T,
                         cluster_cols = T,
                         clustering_method = "complete", #hierarchical method
                         annotation_col = mat_col,
                         annotation_row = mat_col,
                         annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   collection = c('1st' = '#33FF00', '2nd' = '#9933CC')),
                         main = "Aitchison distance")
  
    # Canberra
  heatmp.Can <- pheatmap(as.matrix(dist_list$Can), 
                         clustering_distance_rows = dist_list$Can,
                         clustering_distance_cols = dist_list$Can,
                         fontsize = fontsize,
                         fontsize_col = fontsize-5,
                         fontsize_row = fontsize-5,
                         # border_color = NA,
                         cluster_rows = T,
                         cluster_cols = T,
                         clustering_method = "complete", #hierarchical method
                         annotation_col = mat_col,
                         annotation_row = mat_col,
                         annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                  collection = c('1st' = '#33FF00', '2nd' = '#9933CC')),
                         main = "Canberra distance")
  
    # Gower
  heatmp.Gower <- pheatmap(as.matrix(dist_list$Gower), 
                         clustering_distance_rows = dist_list$Gower,
                         clustering_distance_cols = dist_list$Gower,
                         fontsize = fontsize,
                         fontsize_col = fontsize-5,
                         fontsize_row = fontsize-5,
                         # border_color = NA,
                         cluster_rows = T,
                         cluster_cols = T,
                         clustering_method = "complete", #hierarchical method
                         annotation_col = mat_col,
                         annotation_row = mat_col,
                         annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   collection = c('1st' = '#33FF00', '2nd' = '#9933CC')),
                         main = "Gower distance")
  
  # Binomial
  heatmp.Binomial <- pheatmap(as.matrix(dist_list$Binomial), 
                         clustering_distance_rows = dist_list$Binomial,
                         clustering_distance_cols = dist_list$Binomial,
                         fontsize = fontsize,
                         fontsize_col = fontsize-5,
                         fontsize_row = fontsize-5,
                         # border_color = NA,
                         cluster_rows = T,
                         cluster_cols = T,
                         clustering_method = "complete", #hierarchical method
                         annotation_col = mat_col,
                         annotation_row = mat_col,
                         annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   collection = c('1st' = '#33FF00', '2nd' = '#9933CC')),
                         main = "Binomial distance")
  
  
  return(list("UniF" = heatmp.UniF, "Ait" = heatmp.Ait, "Can" = heatmp.Can, "Gower" = heatmp.Gower, "Bin" = heatmp.Binomial))
}


# Get the heatmaps
no_glom.heatmaps <- Heatmaps(dist_list = no_glom.dist, fontsize = 8)

REPRODUCE PLOTS FROM PAPER

#_____________________________________
# FIGURE 3

#Families
family.table <- physeq %>%
  tax_glom(taxrank = "Family") %>%                     # agglomerate at family level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(family.table[family.table[,"Family"] == "Erysipelotrichaceae",],
       aes(x = host_disease, y = Abundance))+
  geom_boxplot()+
  geom_jitter(width = 0.1)+
  labs(x = "",  y = 'Relative Abundance', title = 'Erysipelotrichaceae family')+
  theme_classic()+
  ylim(0,0.02)

ggplot(family.table[family.table[,"Family"] == "Ruminococcaceae",],
       aes(x = host_disease, y = Abundance))+
  geom_boxplot()+
  geom_jitter(width = 0.1)+
  labs(x = "",  y = 'Relative Abundance', title = 'Ruminococcaceae family')+
  theme_classic()